In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
data = pd.read_excel("C:/Users/Topsheed/Desktop/HotelDataset.xlsx")
In [3]:
data
Out[3]:
Booking ID Date of Booking Time Customer ID Gender Age Origin Country State Location Destination Country ... Check-Out Date Rooms Hotel Name Hotel Rating Payment Mode Bank Name Booking Price[SGD] Discount GST Profit Margin
0 DDMY00001 2010-01-01 10:49:40 MY00001 Male 42 Malaysia Johor Iskandar Puteri Denmark ... 2010-01-20 1 Hotel Triton 4.3 Wallet United Overseas Bank (UOB) 243 0.01 0.07 0.25
1 DDID00002 2010-01-01 09:19:47 ID00001 Female 44 Indonesia Ciawi West Java Colombia ... 2010-01-22 2 Enchanted Isle 4.2 Wallet EZ-Link 312 0.00 0.07 0.24
2 DDSG00003 2010-01-01 11:52:56 SG00001 Female 31 Singapore Central Rochor Germany ... 2010-01-09 2 Seacoast Hotel 4.5 Credit Card Grab 338 0.19 0.07 0.20
3 DDSG00004 2010-01-01 13:44:40 SG00002 Male 28 Singapore North Yishun Canada ... 2010-02-19 2 Night In Paradise 4.2 Debit Card DBS Paylah 254 0.19 0.07 0.13
4 DDKH00005 2010-01-01 05:38:26 KH00001 Male 44 Cambodia Phnom Trop Pursat Kenya ... 2010-01-06 3 Tiny Digs Hotel 4.6 Wallet Bank of Singapore (BOS) 313 0.15 0.07 0.17
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
66536 DDSG66531 2019-12-31 23:36:16 SG12034 Female 42 Singapore Central Orchard Germany ... 2020-01-10 2 Silver Cloud Inn 4.3 Wallet Grab 182 0.23 0.07 0.24
66537 DDSG66532 2019-12-31 14:41:01 SG12035 Female 54 Singapore Central Geylang Israel ... 2020-04-13 2 The Elet 4.2 COD DBS Paylah 125 0.06 0.07 0.19
66538 DDSG66533 2019-12-31 19:11:16 SG12036 Female 57 Singapore Central Downtown Core Canada ... 2020-01-10 4 The Elet 4.4 Debit Card EZ-Link 318 0.02 0.07 0.22
66539 DDTH66534 2019-12-31 05:12:29 TH12170 Female 44 Thailand Surat Thani Ko Samui Maldives ... 2020-01-02 2 Sunset Lodge 4.2 Debit Card HSBC 173 0.14 0.07 0.25
66540 DDVN66535 2019-12-31 00:51:52 VN05959 Female 52 Vietnam Pleiku Gia Lai Egypt ... 2020-01-28 3 Coastal bay hotel 4.3 Internet Banking Grab 182 0.17 0.07 0.24

66541 rows × 24 columns

In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66541 entries, 0 to 66540
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Booking ID           66541 non-null  object        
 1   Date of Booking      66541 non-null  datetime64[ns]
 2   Time                 66541 non-null  object        
 3   Customer ID          66541 non-null  object        
 4   Gender               66541 non-null  object        
 5   Age                  66541 non-null  int64         
 6   Origin Country       66541 non-null  object        
 7   State                66541 non-null  object        
 8   Location             66541 non-null  object        
 9   Destination Country  66541 non-null  object        
 10  Destination City     66541 non-null  object        
 11  No. Of People        66541 non-null  int64         
 12  Check-in date        66541 non-null  datetime64[ns]
 13  No of Days           0 non-null      float64       
 14  Check-Out Date       66541 non-null  datetime64[ns]
 15  Rooms                66541 non-null  int64         
 16  Hotel Name           66541 non-null  object        
 17  Hotel Rating         66541 non-null  float64       
 18  Payment Mode         66541 non-null  object        
 19  Bank Name            66541 non-null  object        
 20  Booking Price[SGD]   66541 non-null  int64         
 21  Discount             66541 non-null  float64       
 22  GST                  66541 non-null  float64       
 23  Profit Margin        66541 non-null  float64       
dtypes: datetime64[ns](3), float64(5), int64(4), object(12)
memory usage: 12.2+ MB
In [5]:
data.isnull().sum()
Out[5]:
Booking ID                 0
Date of Booking            0
Time                       0
Customer ID                0
Gender                     0
Age                        0
Origin Country             0
State                      0
Location                   0
Destination Country        0
Destination City           0
No. Of People              0
Check-in date              0
No of Days             66541
Check-Out Date             0
Rooms                      0
Hotel Name                 0
Hotel Rating               0
Payment Mode               0
Bank Name                  0
Booking Price[SGD]         0
Discount                   0
GST                        0
Profit Margin              0
dtype: int64

data.columns

In [6]:
data.nunique()
Out[6]:
Booking ID             66536
Date of Booking         3652
Time                   39308
Customer ID            66536
Gender                     2
Age                       41
Origin Country             7
State                    219
Location                 228
Destination Country       20
Destination City         120
No. Of People              7
Check-in date           3747
No of Days                 0
Check-Out Date          3748
Rooms                      4
Hotel Name               614
Hotel Rating              11
Payment Mode               5
Bank Name                  9
Booking Price[SGD]       491
Discount                  26
GST                        1
Profit Margin             17
dtype: int64
In [7]:
data.columns
Out[7]:
Index(['Booking ID', 'Date of Booking', 'Time', 'Customer ID', 'Gender', 'Age',
       'Origin Country', 'State', 'Location', 'Destination Country',
       'Destination City', 'No. Of People', 'Check-in date', 'No of Days',
       'Check-Out Date', 'Rooms', 'Hotel Name', 'Hotel Rating', 'Payment Mode',
       'Bank Name', 'Booking Price[SGD]', 'Discount', 'GST', 'Profit Margin'],
      dtype='object')
In [8]:
data['Profit Margin'].sum()
Out[8]:
12564.040000000003
In [9]:
data1 = data.drop(['Booking ID','Customer ID','GST','State','Location','Bank Name','No of Days',], axis = 1)

Observation:¶

data1 is the new data after droping of some of the unneeded data variables

Observation¶

  • Feauture Engineering
In [10]:
data1
Out[10]:
Date of Booking Time Gender Age Origin Country Destination Country Destination City No. Of People Check-in date Check-Out Date Rooms Hotel Name Hotel Rating Payment Mode Booking Price[SGD] Discount Profit Margin
0 2010-01-01 10:49:40 Male 42 Malaysia Denmark Horsens 1 2010-01-12 2010-01-20 1 Hotel Triton 4.3 Wallet 243 0.01 0.25
1 2010-01-01 09:19:47 Female 44 Indonesia Colombia Medellin 3 2010-01-21 2010-01-22 2 Enchanted Isle 4.2 Wallet 312 0.00 0.24
2 2010-01-01 11:52:56 Female 31 Singapore Germany Munich 3 2010-01-02 2010-01-09 2 Seacoast Hotel 4.5 Credit Card 338 0.19 0.20
3 2010-01-01 13:44:40 Male 28 Singapore Canada Montreal 3 2010-02-15 2010-02-19 2 Night In Paradise 4.2 Debit Card 254 0.19 0.13
4 2010-01-01 05:38:26 Male 44 Cambodia Kenya Nairobi 5 2010-01-03 2010-01-06 3 Tiny Digs Hotel 4.6 Wallet 313 0.15 0.17
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
66536 2019-12-31 23:36:16 Female 42 Singapore Germany Berlin 4 2020-01-06 2020-01-10 2 Silver Cloud Inn 4.3 Wallet 182 0.23 0.24
66537 2019-12-31 14:41:01 Female 54 Singapore Israel Holon 4 2020-04-09 2020-04-13 2 The Elet 4.2 COD 125 0.06 0.19
66538 2019-12-31 19:11:16 Female 57 Singapore Canada Ottawa 7 2020-01-09 2020-01-10 4 The Elet 4.4 Debit Card 318 0.02 0.22
66539 2019-12-31 05:12:29 Female 44 Thailand Maldives Viligili 3 2020-01-01 2020-01-02 2 Sunset Lodge 4.2 Debit Card 173 0.14 0.25
66540 2019-12-31 00:51:52 Female 52 Vietnam Egypt Luxor 5 2020-01-24 2020-01-28 3 Coastal bay hotel 4.3 Internet Banking 182 0.17 0.24

66541 rows × 17 columns

Observation¶

  • I Observe we dont real need some of the variable in the data
  • so i decided to drop some of the un needed variables : Customer ID, No. of days, Bank Name and others
In [11]:
data1.columns = data1.columns.str.replace (' ','_') # Replacing the space (s) in the column names with underscore
data
Out[11]:
Booking ID Date of Booking Time Customer ID Gender Age Origin Country State Location Destination Country ... Check-Out Date Rooms Hotel Name Hotel Rating Payment Mode Bank Name Booking Price[SGD] Discount GST Profit Margin
0 DDMY00001 2010-01-01 10:49:40 MY00001 Male 42 Malaysia Johor Iskandar Puteri Denmark ... 2010-01-20 1 Hotel Triton 4.3 Wallet United Overseas Bank (UOB) 243 0.01 0.07 0.25
1 DDID00002 2010-01-01 09:19:47 ID00001 Female 44 Indonesia Ciawi West Java Colombia ... 2010-01-22 2 Enchanted Isle 4.2 Wallet EZ-Link 312 0.00 0.07 0.24
2 DDSG00003 2010-01-01 11:52:56 SG00001 Female 31 Singapore Central Rochor Germany ... 2010-01-09 2 Seacoast Hotel 4.5 Credit Card Grab 338 0.19 0.07 0.20
3 DDSG00004 2010-01-01 13:44:40 SG00002 Male 28 Singapore North Yishun Canada ... 2010-02-19 2 Night In Paradise 4.2 Debit Card DBS Paylah 254 0.19 0.07 0.13
4 DDKH00005 2010-01-01 05:38:26 KH00001 Male 44 Cambodia Phnom Trop Pursat Kenya ... 2010-01-06 3 Tiny Digs Hotel 4.6 Wallet Bank of Singapore (BOS) 313 0.15 0.07 0.17
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
66536 DDSG66531 2019-12-31 23:36:16 SG12034 Female 42 Singapore Central Orchard Germany ... 2020-01-10 2 Silver Cloud Inn 4.3 Wallet Grab 182 0.23 0.07 0.24
66537 DDSG66532 2019-12-31 14:41:01 SG12035 Female 54 Singapore Central Geylang Israel ... 2020-04-13 2 The Elet 4.2 COD DBS Paylah 125 0.06 0.07 0.19
66538 DDSG66533 2019-12-31 19:11:16 SG12036 Female 57 Singapore Central Downtown Core Canada ... 2020-01-10 4 The Elet 4.4 Debit Card EZ-Link 318 0.02 0.07 0.22
66539 DDTH66534 2019-12-31 05:12:29 TH12170 Female 44 Thailand Surat Thani Ko Samui Maldives ... 2020-01-02 2 Sunset Lodge 4.2 Debit Card HSBC 173 0.14 0.07 0.25
66540 DDVN66535 2019-12-31 00:51:52 VN05959 Female 52 Vietnam Pleiku Gia Lai Egypt ... 2020-01-28 3 Coastal bay hotel 4.3 Internet Banking Grab 182 0.17 0.07 0.24

66541 rows × 24 columns

Observation:¶

  • Convert date columns to datetime objects
  • & Create number of days spent column by subtracting booking date from checkin date
In [12]:
# Convert date columns to datetime objects

data1['Check-Out_Date'] = pd.to_datetime(data1['Check-Out_Date'])

data1['Check-in_date'] = pd.to_datetime(data1['Check-in_date'])

# Create number of days spent column by subtracting booking date from checkin date

data1['No_of_Days'] = data1['Check-Out_Date'] - data1['Check-in_date']


#print new data1

data1.head()
Out[12]:
Date_of_Booking Time Gender Age Origin_Country Destination_Country Destination_City No._Of_People Check-in_date Check-Out_Date Rooms Hotel_Name Hotel_Rating Payment_Mode Booking_Price[SGD] Discount Profit_Margin No_of_Days
0 2010-01-01 10:49:40 Male 42 Malaysia Denmark Horsens 1 2010-01-12 2010-01-20 1 Hotel Triton 4.3 Wallet 243 0.01 0.25 8 days
1 2010-01-01 09:19:47 Female 44 Indonesia Colombia Medellin 3 2010-01-21 2010-01-22 2 Enchanted Isle 4.2 Wallet 312 0.00 0.24 1 days
2 2010-01-01 11:52:56 Female 31 Singapore Germany Munich 3 2010-01-02 2010-01-09 2 Seacoast Hotel 4.5 Credit Card 338 0.19 0.20 7 days
3 2010-01-01 13:44:40 Male 28 Singapore Canada Montreal 3 2010-02-15 2010-02-19 2 Night In Paradise 4.2 Debit Card 254 0.19 0.13 4 days
4 2010-01-01 05:38:26 Male 44 Cambodia Kenya Nairobi 5 2010-01-03 2010-01-06 3 Tiny Digs Hotel 4.6 Wallet 313 0.15 0.17 3 days

Observation:¶

  • Trying to change the format of the Booking Date to (Str) type & Spliting the BookingYear, BookingMonth, & the BookingDay
In [13]:
#Converting a timestamp column into string so we can extract 'Year as BookingYear', BookingMonth as Month

df=data1['Date_of_Booking'].astype(str)

#extract Year, Month, Date

data1['BookingYear'] =df.apply(lambda x: x.split("-")[0])
data1['BookingMonth'] =df.apply(lambda x: x.split("-")[1])
data1['BookingDay'] =df.apply(lambda x: x.split("-")[2])

#print our new data1
data1
Out[13]:
Date_of_Booking Time Gender Age Origin_Country Destination_Country Destination_City No._Of_People Check-in_date Check-Out_Date ... Hotel_Name Hotel_Rating Payment_Mode Booking_Price[SGD] Discount Profit_Margin No_of_Days BookingYear BookingMonth BookingDay
0 2010-01-01 10:49:40 Male 42 Malaysia Denmark Horsens 1 2010-01-12 2010-01-20 ... Hotel Triton 4.3 Wallet 243 0.01 0.25 8 days 2010 01 01
1 2010-01-01 09:19:47 Female 44 Indonesia Colombia Medellin 3 2010-01-21 2010-01-22 ... Enchanted Isle 4.2 Wallet 312 0.00 0.24 1 days 2010 01 01
2 2010-01-01 11:52:56 Female 31 Singapore Germany Munich 3 2010-01-02 2010-01-09 ... Seacoast Hotel 4.5 Credit Card 338 0.19 0.20 7 days 2010 01 01
3 2010-01-01 13:44:40 Male 28 Singapore Canada Montreal 3 2010-02-15 2010-02-19 ... Night In Paradise 4.2 Debit Card 254 0.19 0.13 4 days 2010 01 01
4 2010-01-01 05:38:26 Male 44 Cambodia Kenya Nairobi 5 2010-01-03 2010-01-06 ... Tiny Digs Hotel 4.6 Wallet 313 0.15 0.17 3 days 2010 01 01
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
66536 2019-12-31 23:36:16 Female 42 Singapore Germany Berlin 4 2020-01-06 2020-01-10 ... Silver Cloud Inn 4.3 Wallet 182 0.23 0.24 4 days 2019 12 31
66537 2019-12-31 14:41:01 Female 54 Singapore Israel Holon 4 2020-04-09 2020-04-13 ... The Elet 4.2 COD 125 0.06 0.19 4 days 2019 12 31
66538 2019-12-31 19:11:16 Female 57 Singapore Canada Ottawa 7 2020-01-09 2020-01-10 ... The Elet 4.4 Debit Card 318 0.02 0.22 1 days 2019 12 31
66539 2019-12-31 05:12:29 Female 44 Thailand Maldives Viligili 3 2020-01-01 2020-01-02 ... Sunset Lodge 4.2 Debit Card 173 0.14 0.25 1 days 2019 12 31
66540 2019-12-31 00:51:52 Female 52 Vietnam Egypt Luxor 5 2020-01-24 2020-01-28 ... Coastal bay hotel 4.3 Internet Banking 182 0.17 0.24 4 days 2019 12 31

66541 rows × 21 columns

Observation:¶

  • Trying to get the time between the Booking Date & the Check in Date Calling it: Lead Time
In [14]:
# Convert Lead time columns by substracting Date_of_Booking from Check_in_Date


data1['Lead_time'] = data1['Check-in_date'] - data1['Date_of_Booking']


#print new data1

data1.head()
Out[14]:
Date_of_Booking Time Gender Age Origin_Country Destination_Country Destination_City No._Of_People Check-in_date Check-Out_Date ... Hotel_Rating Payment_Mode Booking_Price[SGD] Discount Profit_Margin No_of_Days BookingYear BookingMonth BookingDay Lead_time
0 2010-01-01 10:49:40 Male 42 Malaysia Denmark Horsens 1 2010-01-12 2010-01-20 ... 4.3 Wallet 243 0.01 0.25 8 days 2010 01 01 11 days
1 2010-01-01 09:19:47 Female 44 Indonesia Colombia Medellin 3 2010-01-21 2010-01-22 ... 4.2 Wallet 312 0.00 0.24 1 days 2010 01 01 20 days
2 2010-01-01 11:52:56 Female 31 Singapore Germany Munich 3 2010-01-02 2010-01-09 ... 4.5 Credit Card 338 0.19 0.20 7 days 2010 01 01 1 days
3 2010-01-01 13:44:40 Male 28 Singapore Canada Montreal 3 2010-02-15 2010-02-19 ... 4.2 Debit Card 254 0.19 0.13 4 days 2010 01 01 45 days
4 2010-01-01 05:38:26 Male 44 Cambodia Kenya Nairobi 5 2010-01-03 2010-01-06 ... 4.6 Wallet 313 0.15 0.17 3 days 2010 01 01 2 days

5 rows × 22 columns

Observation:¶

  • we are droping the Date of Booking, Check in Date, and Check out Date
    • this is because we have split the already in the data frame and belief we dont need them again
In [15]:
data1 = data1.drop(['Date_of_Booking','Check-in_date','Check-Out_Date'], axis = 1)
In [16]:
data1
Out[16]:
Time Gender Age Origin_Country Destination_Country Destination_City No._Of_People Rooms Hotel_Name Hotel_Rating Payment_Mode Booking_Price[SGD] Discount Profit_Margin No_of_Days BookingYear BookingMonth BookingDay Lead_time
0 10:49:40 Male 42 Malaysia Denmark Horsens 1 1 Hotel Triton 4.3 Wallet 243 0.01 0.25 8 days 2010 01 01 11 days
1 09:19:47 Female 44 Indonesia Colombia Medellin 3 2 Enchanted Isle 4.2 Wallet 312 0.00 0.24 1 days 2010 01 01 20 days
2 11:52:56 Female 31 Singapore Germany Munich 3 2 Seacoast Hotel 4.5 Credit Card 338 0.19 0.20 7 days 2010 01 01 1 days
3 13:44:40 Male 28 Singapore Canada Montreal 3 2 Night In Paradise 4.2 Debit Card 254 0.19 0.13 4 days 2010 01 01 45 days
4 05:38:26 Male 44 Cambodia Kenya Nairobi 5 3 Tiny Digs Hotel 4.6 Wallet 313 0.15 0.17 3 days 2010 01 01 2 days
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
66536 23:36:16 Female 42 Singapore Germany Berlin 4 2 Silver Cloud Inn 4.3 Wallet 182 0.23 0.24 4 days 2019 12 31 6 days
66537 14:41:01 Female 54 Singapore Israel Holon 4 2 The Elet 4.2 COD 125 0.06 0.19 4 days 2019 12 31 100 days
66538 19:11:16 Female 57 Singapore Canada Ottawa 7 4 The Elet 4.4 Debit Card 318 0.02 0.22 1 days 2019 12 31 9 days
66539 05:12:29 Female 44 Thailand Maldives Viligili 3 2 Sunset Lodge 4.2 Debit Card 173 0.14 0.25 1 days 2019 12 31 1 days
66540 00:51:52 Female 52 Vietnam Egypt Luxor 5 3 Coastal bay hotel 4.3 Internet Banking 182 0.17 0.24 4 days 2019 12 31 24 days

66541 rows × 19 columns

Eploratoty Data Analisis [EDA]¶

In [17]:
data1.describe(include='all').T
Out[17]:
count unique top freq mean std min 25% 50% 75% max
Time 66541 39308 09:48:22 14 NaN NaN NaN NaN NaN NaN NaN
Gender 66541 2 Female 33388 NaN NaN NaN NaN NaN NaN NaN
Age 66541.0 NaN NaN NaN 38.436182 11.557562 -5.0 28.0 38.0 48.0 58.0
Origin_Country 66541 7 Thailand 12170 NaN NaN NaN NaN NaN NaN NaN
Destination_Country 66541 20 New Zealand 3448 NaN NaN NaN NaN NaN NaN NaN
Destination_City 66541 120 Hamilton 1084 NaN NaN NaN NaN NaN NaN NaN
No._Of_People 66541.0 NaN NaN NaN 3.995777 2.000074 1.0 2.0 4.0 6.0 7.0
Rooms 66541.0 NaN NaN NaN 2.283344 1.031134 1.0 1.0 2.0 3.0 4.0
Hotel_Name 66541 614 Grand Hyatt 221 NaN NaN NaN NaN NaN NaN NaN
Hotel_Rating 66541.0 NaN NaN NaN 4.2885 0.275516 3.3 4.2 4.3 4.5 4.7
Payment_Mode 66541 5 Internet Banking 13446 NaN NaN NaN NaN NaN NaN NaN
Booking_Price[SGD] 66541.0 NaN NaN NaN 214.046783 108.126184 35.0 125.0 199.0 293.0 578.0
Discount 66541.0 NaN NaN NaN 0.12547 0.075113 0.0 0.06 0.13 0.19 0.25
Profit_Margin 66541.0 NaN NaN NaN 0.188817 0.052765 0.1 0.14 0.2 0.23 0.3
No_of_Days 66541 NaN NaN NaN 3 days 05:36:37.352008536 3 days 09:04:06.621354014 1 days 00:00:00 1 days 00:00:00 2 days 00:00:00 4 days 00:00:00 97 days 00:00:00
BookingYear 66541 10 2019 9502 NaN NaN NaN NaN NaN NaN NaN
BookingMonth 66541 12 12 5747 NaN NaN NaN NaN NaN NaN NaN
BookingDay 66541 31 25 2255 NaN NaN NaN NaN NaN NaN NaN
Lead_time 66541 NaN NaN NaN 28 days 12:29:07.061210381 30 days 05:02:51.843807522 1 days 00:00:00 4 days 00:00:00 15 days 00:00:00 51 days 00:00:00 100 days 00:00:00
In [18]:
# seprating numerical and catigorical variables for easy analysis
cat_cols = data1.select_dtypes(include=['object']).columns
num_cols = data1.select_dtypes(include=np.number).columns.tolist()
print(cat_cols)
print("Numerical Variables: ")
print(num_cols)
Index(['Time', 'Gender', 'Origin_Country', 'Destination_Country',
       'Destination_City', 'Hotel_Name', 'Payment_Mode', 'BookingYear',
       'BookingMonth', 'BookingDay'],
      dtype='object')
Numerical Variables: 
['Age', 'No._Of_People', 'Rooms', 'Hotel_Rating', 'Booking_Price[SGD]', 'Discount', 'Profit_Margin', 'No_of_Days', 'Lead_time']

EDA Univariate Analysis¶

In [19]:
for col in num_cols:

    print(col)

    print('Skew :', round(data1[col].skew(), 2))

    plt.figure(figsize = (15, 4))

    plt.subplot(1, 2, 1)

    data1[col].hist(grid=False)

    plt.ylabel('count')

    plt.subplot(1, 2, 2)

    sns.boxplot(x=data1[col])

    plt.show()
Age
Skew : 0.01
No._Of_People
Skew : 0.01
Rooms
Skew : 0.2
Hotel_Rating
Skew : -1.17
Booking_Price[SGD]
Skew : 0.39
Discount
Skew : -0.01
Profit_Margin
Skew : 0.08
No_of_Days
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[19], line 5
      1 for col in num_cols:
      3     print(col)
----> 5     print('Skew :', round(data1[col].skew(), 2))
      7     plt.figure(figsize = (15, 4))
      9     plt.subplot(1, 2, 1)

File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:11577, in NDFrame._add_numeric_operations.<locals>.skew(self, axis, skipna, numeric_only, **kwargs)
  11560 @doc(
  11561     _num_doc,
  11562     desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.",
   (...)
  11575     **kwargs,
  11576 ):
> 11577     return NDFrame.skew(self, axis, skipna, numeric_only, **kwargs)

File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:11223, in NDFrame.skew(self, axis, skipna, numeric_only, **kwargs)
  11216 def skew(
  11217     self,
  11218     axis: Axis | None = 0,
   (...)
  11221     **kwargs,
  11222 ) -> Series | float:
> 11223     return self._stat_function(
  11224         "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs
  11225     )

File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:11158, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
  11154     nv.validate_stat_func((), kwargs, fname=name)
  11156 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11158 return self._reduce(
  11159     func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
  11160 )

File ~\anaconda3\Lib\site-packages\pandas\core\series.py:4656, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   4652     self._get_axis_number(axis)
   4654 if isinstance(delegate, ExtensionArray):
   4655     # dispatch to ExtensionArray interface
-> 4656     return delegate._reduce(name, skipna=skipna, **kwds)
   4658 else:
   4659     # dispatch to numpy arrays
   4660     if numeric_only and not is_numeric_dtype(self.dtype):

File ~\anaconda3\Lib\site-packages\pandas\core\arrays\base.py:1440, in ExtensionArray._reduce(self, name, skipna, **kwargs)
   1438 meth = getattr(self, name, None)
   1439 if meth is None:
-> 1440     raise TypeError(
   1441         f"'{type(self).__name__}' with dtype {self.dtype} "
   1442         f"does not support reduction '{name}'"
   1443     )
   1444 return meth(skipna=skipna, **kwargs)

TypeError: 'TimedeltaArray' with dtype timedelta64[ns] does not support reduction 'skew'

Data/Log Transformation¶

  • this is done to correct the skewness
In [ ]:
# Function for log transformation of the column

def log_transform(data,col):

    for colname in col:

        if (data[colname] == 1.0).all():

            data[colname + '_log'] = np.log(data[colname]+1)

        else:

            data1[colname + '_log'] = np.log(data1[colname])

log_transform(data1,['Hotel_Rating'])

#Log transformation of the feature 'Hotel_Rating'

sns.distplot(data1["Hotel_Rating_log"], axlabel="Hotel_Rating_log");

plt.show()
In [ ]:
data1
In [83]:
fig, axes = plt.subplots(5, 2, figsize = (100, 150))

sns.set_style('whitegrid')


fig.suptitle('Bar plot for all categorical variables in the dataset')

sns.countplot(ax = axes[0, 0], x = 'Payment_Mode', data = data1, palette = 'RdBu_r', 

              order = data1['Payment_Mode'].value_counts().head(20).index);


sns.countplot(ax = axes[0, 1], x = 'BookingDay', data = data1, palette = 'rainbow',

             order = data1['BookingDay'].value_counts().head(20).index);


sns.countplot(ax = axes[1, 0], x = 'Gender', data = data1, palette="Set1",

             order = data1['Gender'].value_counts().head(20).index);


sns.countplot(ax = axes[1, 1], x = 'Hotel_Name', data = data1, palette="PuOr", 

              order = data1['Hotel_Name'].value_counts().head(20).index);


sns.countplot(ax = axes[2, 0], x = 'Origin_Country', data = data1, color = 'blue',

             order = data1['Origin_Country'].value_counts().head(20).index);


sns.countplot(ax = axes[2, 1], x = 'Destination_Country', data = data1, palette="RdBu",

             order = data1['Destination_Country'].value_counts().head(20).index);


sns.countplot(ax = axes[3, 0], x = 'BookingYear', data = data1, color = 'red', 

              order = data1['BookingYear'].value_counts().head(20).index);


sns.countplot(ax = axes[3, 1], x = 'BookingMonth', data = data1, palette="Set2",

             order = data1['BookingMonth'].value_counts().head(20).index);



sns.countplot(ax = axes[4, 0], x = 'Destination_City', data = data1, color = 'red', 

              order = data1['Destination_City'].value_counts().head(20).index);


sns.countplot(ax = axes[4, 1], x = 'Time', data = data1, palette="Set2",

             order = data1['Time'].value_counts().head(20).index);


axes[1][1].tick_params(labelrotation=90);

axes[2][0].tick_params(labelrotation=90);

axes[2][1].tick_params(labelrotation=90);

axes[4][0].tick_params(labelrotation=90);

axes[4][1].tick_params(labelrotation=90);


plt.rc('font', size=8)

plt.rc('axes', titlesize=100)

plt.rc('axes', labelsize=20)

plt.rc('xtick', labelsize=80)

plt.rc('ytick', labelsize=80)

plt.rc('legend', fontsize=80)

plt.rc('figure', titlesize=100)

plt.subplots_adjust(hspace=1)
plt.subplots_adjust(wspace=.5)
sns.despine()
In [102]:
num_cols
Out[102]:
['Age',
 'No._Of_People',
 'Rooms',
 'Hotel_Rating',
 'Booking_Price[SGD]',
 'Discount',
 'Profit_Margin',
 'No_of_Days',
 'Lead_time']
In [109]:
figsize = (100, 150)

sns.set_style('whitegrid')

ax = sns.countplot(y='BookingDay', hue = 'Gender', data=data1, palette='colorblind', order = data1['BookingDay'].value_counts().head().index)

total = data1['BookingDay'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='BookingDay', title='BookingDay by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)

plt.tight_layout()

plt.show()




figsize = (18, 20)

sns.set_style('whitegrid')

ax = sns.countplot(y='BookingYear', hue = 'Gender', data=data1, palette='Set1', order = data1['BookingYear'].value_counts().head().index)

total = data1['BookingYear'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='BookingYear', title='BookingYear by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)

plt.tight_layout()

plt.show()


figsize = (18, 20)

sns.set_style('whitegrid')

ax = sns.countplot(y='Booking_Price[SGD]', hue = 'Gender', data=data1, palette='Set2', order = data1['Booking_Price[SGD]'].value_counts().head().index)

total = data1['Booking_Price[SGD]'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='Booking_Price[SGD]', title='Booking_Price by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)

plt.tight_layout()

plt.show


figsize = (18, 20)

sns.set_style('whitegrid')

ax = sns.countplot(y='Payment_Mode', hue = 'Gender', data=data1, palette='rainbow', order = data1['Payment_Mode'].value_counts().head().index)

total = data1['Payment_Mode'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='Payment_Mode', title='Payment_Mode by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)

plt.tight_layout()

plt.show()



figsize = (18, 20)

sns.set_style('whitegrid')

ax = sns.countplot(y='Hotel_Rating', hue = 'Gender', data=data1, palette='Set3', order = data1['Hotel_Rating'].value_counts().head().index)

total = data1['Hotel_Rating'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='Hotel_Rating', title='Hotel_Rating by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)

plt.tight_layout()

plt.show()



figsize = (18, 20)

sns.set_style('whitegrid')

ax = sns.countplot(y='Hotel_Name', hue = 'Gender', data=data1, palette='PuOr', order = data1['Hotel_Name'].value_counts().head().index)

total = data1['Hotel_Name'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='Hotel_Name', title='Hotel_Name by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)

plt.tight_layout()

plt.show()


figsize = (18, 20)

sns.set_style('whitegrid')

ax = sns.countplot(y='Destination_Country', hue = 'Gender', data=data1, palette="Paired", order = data1['Destination_Country'].value_counts().head().index)

total = data1['Destination_Country'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='Destination_Country', title='Destination_Country by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)

plt.tight_layout()

plt.show()


figsize = (18, 20)

sns.set_style('whitegrid')

ax = sns.countplot(y='Age', hue = 'Gender', data=data1, palette='PuOr', order = data1['Age'].value_counts().head().index)

total = data1['Age'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='Age', title='Age by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)

plt.tight_layout()

plt.show()


figsize = (18, 20)

sns.set_style('whitegrid')

ax = sns.countplot(y='Origin_Country', hue = 'Gender', data=data1, palette='colorblind', order = data1['Origin_Country'].value_counts().head().index)

total = data1['Origin_Country'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='Origin_Country', title='Origin_Country by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)

plt.tight_layout()

plt.show()


figsize = (18, 20)

sns.set_style('whitegrid')

ax = sns.countplot(y='Lead_time', hue = 'Gender', data=data1, palette='colorblind', order = data1['Lead_time'].value_counts().head().index)

total = data1['Lead_time'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='Lead_time', title='Lead_time by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=8)

plt.tight_layout()

plt.show()
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:15: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:38: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:59: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:80: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:82: UserWarning: The figure layout has changed to tight
  plt.tight_layout()
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:102: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:124: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:145: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:166: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:187: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\2714701457.py:208: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=8)
In [108]:
figsize = (100, 150)

sns.set_style('whitegrid')

ax = sns.countplot(y='BookingDay', hue = 'Gender', data=data1, palette='colorblind', order = data1['BookingDay'].value_counts().head().index)

total = data1['BookingDay'].count()

ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:.1f}%')

ax.margins(x=0.1)

ax.set(xlabel='Count', ylabel='BookingDay', title='BookingDay by Gender')

ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)

plt.rc('font', size=15)

plt.rc('axes', titlesize=20)

plt.rc('axes', labelsize=35)

plt.rc('xtick', labelsize=20)

plt.rc('ytick', labelsize=20)

plt.rc('legend', fontsize=15)

plt.rc('figure', titlesize=20)

plt.subplots_adjust(hspace=1)
plt.subplots_adjust(wspace=.5)
sns.despine()

plt.tight_layout()

plt.show()
C:\Users\Topsheed\AppData\Local\Temp\ipykernel_2864\4052684459.py:15: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(ax.get_xticklabels(), fontsize=7)
In [121]:
plt.figure(figsize=(20,30))

sns.pairplot(data1)

plt.show()
C:\Users\Topsheed\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
<Figure size 2000x3000 with 0 Axes>
In [124]:
fig, axarr = plt.subplots(5, 2, figsize=(12, 18))

data1.groupby('Payment_Mode')['Profit_Margin'].mean().sort_values(ascending=False).plot.bar(ax=axarr[0][0], fontsize=12)

axarr[0][0].set_title("Payment_Mode Vs Profit_Margin", fontsize=18)

data1.groupby('Origin_Country')['Profit_Margin'].mean().sort_values(ascending=False).plot.bar(ax=axarr[0][1], fontsize=12)

axarr[0][1].set_title("Origin_Country Vs Profit_Margin", fontsize=18)

data1.groupby('Destination_Country')['Profit_Margin'].mean().sort_values(ascending=False).plot.bar(ax=axarr[1][0], fontsize=12)

axarr[1][0].set_title("Destination_Country Vs Profit_Margin", fontsize=18)

data1.groupby('BookingYear')['Profit_Margin'].mean().sort_values(ascending=False).plot.bar(ax=axarr[1][1], fontsize=12)

axarr[1][1].set_title("BookingYear Vs Profit_Margin", fontsize=18)

data1.groupby('BookingMonth')['Profit_Margin'].mean().sort_values(ascending=False).head(10).plot.bar(ax=axarr[2][0], fontsize=12)

axarr[2][0].set_title("Booking month Vs Profit_Margin", fontsize=18)

data1.groupby('BookingDay')['Profit_Margin'].mean().sort_values(ascending=False).head(10).plot.bar(ax=axarr[2][1], fontsize=12)

axarr[2][1].set_title("Booking day Vs Profit_Margin", fontsize=18)

data1.groupby('Gender')['Profit_Margin'].mean().sort_values(ascending=False).plot.bar(ax=axarr[3][0], fontsize=12)

axarr[3][0].set_title("Gender Vs Profit_Margin", fontsize=18)

data1.groupby('Destination_City')['Profit_Margin'].mean().sort_values(ascending=False).head(10).plot.bar(ax=axarr[3][1], fontsize=12)

axarr[3][1].set_title("Destination city Vs Profit_Margin", fontsize=18)

data1.groupby('Time')['Profit_Margin'].mean().sort_values(ascending=False).head(10).plot.bar(ax=axarr[4][0], fontsize=12)

axarr[4][0].set_title("Time Vs Profit_Margin", fontsize=18)

data1.groupby('Hotel_Name')['Profit_Margin'].mean().sort_values(ascending=False).head(10).plot.bar(ax=axarr[4][1], fontsize=12)

axarr[4][1].set_title("Hotel_Name Vs Profit_Margin", fontsize=18)

plt.subplots_adjust(hspace=1.2)

plt.subplots_adjust(wspace=.5)

sns.despine()

EAD _ Multivariate¶

* Working on Several Variable
In [132]:
plt.figure(figsize=(12, 7))

sns.heatmap(data1.drop(['Time', 'Gender', 'Origin_Country', 'Destination_Country',

       'Destination_City', 'Hotel_Name', 'Payment_Mode', 'BookingYear',

       'BookingMonth', 'BookingDay'],axis=1).corr(), annot = True, vmin = -1, vmax = 1)

plt.show()
In [127]:
data1['Origin_Country'].count()
Out[127]:
66541
In [ ]: